{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    Copyright (c) 2018,  Bosch Software Innovations GmbH.\n",
    "\n",
    "    Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "    you may not use this file except in compliance with the License.\n",
    "    You may obtain a copy of the License at\n",
    "\n",
    "        http://www.apache.org/licenses/LICENSE-2.0\n",
    "\n",
    "    Unless required by applicable law or agreed to in writing, software\n",
    "    distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "    See the License for the specific language governing permissions and\n",
    "    limitations under the License."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import of all the basic modules ([pandas](https://pandas.pydata.org/)). See [plotting](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html) for help on charts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import os\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Common Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BASE_PATH = os.path.join('build', 'bin')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Common Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_csv(filepath, columns = None):\n",
    "    dataframe = pd.read_csv(os.path.join(BASE_PATH, filepath))\n",
    "    if columns is not None:\n",
    "        assert set(dataframe.columns) == set(columns)\n",
    "    return dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_duration(dataframe, start_column, end_column, target_column):\n",
    "    dataframe[target_column] = dataframe.apply(lambda row: row[end_column] - row[start_column], axis=1)\n",
    "    dataframe.drop([start_column, end_column], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def delete_column(dataframe, column):\n",
    "    dataframe.drop([column], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def duration(dataframe, start_column, end_column, target_column):\n",
    "    dataframe[target_column] = dataframe.apply(lambda row: row[end_column] - row[start_column], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scale_value(dataframe, source_column, target_column, factor, ndigits=None):\n",
    "    dataframe[target_column] = dataframe.apply(lambda row: factor * row[source_column], axis=1)\n",
    "    if ndigits is not None:\n",
    "        dataframe[target_column] = dataframe.apply(lambda row: round(row[target_column], ndigits), axis=1)\n",
    "    if source_column != target_column:\n",
    "        dataframe.drop([source_column], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def progress_column(subject):\n",
    "    return lambda i: subject + '_' + str(i) + ' (ms)'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "percent = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]\n",
    "def calc_write_progress(dataframe, progress_column, target_column):\n",
    "    for i in percent:\n",
    "        duration(dataframe, progress_column(i-10), progress_column(i), target_column(i))\n",
    "    for i in [0] + percent:\n",
    "        delete_column(dataframe, progress_column(i))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Benchmarks"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1) different SQLite modi: performance vs. \"stability\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#data1_columns = None\n",
    "#data1 = read_csv('sqlite-db-settings-benchmark.csv', data1_columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2) small messages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2 = read_csv('small_messages_benchmark.csv', [\n",
    "    'description',\n",
    "    'number of messages',\n",
    "    'message blob size (bytes)',\n",
    "    'transaction size',\n",
    "    'start writing time (ms)',\n",
    "    'write_throughput_0 (ms)',\n",
    "    'write_throughput_10 (ms)',\n",
    "    'write_throughput_20 (ms)',\n",
    "    'write_throughput_30 (ms)',\n",
    "    'write_throughput_40 (ms)',\n",
    "    'write_throughput_50 (ms)',\n",
    "    'write_throughput_60 (ms)',\n",
    "    'write_throughput_70 (ms)',\n",
    "    'write_throughput_80 (ms)',\n",
    "    'write_throughput_90 (ms)',\n",
    "    'write_throughput_100 (ms)',\n",
    "    'end writing time (ms)',\n",
    "    'start indexing time (ms)',\n",
    "    'end indexing time (ms)',\n",
    "    'disk usage (bytes)'\n",
    "])\n",
    "\n",
    "calc_duration(data2, 'start writing time (ms)', 'end writing time (ms)', 'writing time (ms)')\n",
    "calc_duration(data2, 'start indexing time (ms)', 'end indexing time (ms)', 'indexing time (ms)')\n",
    "scale_value(data2, 'disk usage (bytes)', 'disk usage (MB)', factor=1/1024/1024)\n",
    "\n",
    "data2['disk io (messages / s)'] = data2.apply(lambda row: row['number of messages'] / row['writing time (ms)'] * 1000, axis=1)\n",
    "data2['disk io (MB / s)'] = data2.apply(lambda row: row['number of messages'] * row['message blob size (bytes)'] / 1024 / 1024 / row['writing time (ms)'] * 1000, axis=1)\n",
    "\n",
    "throughput_abs = progress_column('write_throughput')\n",
    "throughput_duration = progress_column('write_duration')\n",
    "calc_write_progress(data2, throughput_abs, throughput_duration)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data2_aggregated = data2.groupby('description').median().sort_values(by='message blob size (bytes)')\n",
    "data2_aggregated"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Throughput\n",
    "This diagram shows how long it took to write each 10% slice of the messages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "data2_aggregated[[throughput_duration(i) for i in percent]].T.plot(kind='bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2_aggregated.plot(kind='bar', x='message blob size (bytes)', y='writing time (ms)');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2_aggregated.plot(kind='bar', x='message blob size (bytes)', y='disk usage (MB)');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2_aggregated.plot(kind='bar', x='message blob size (bytes)', y='disk io (messages / s)');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data2_aggregated.plot(kind='bar', x='message blob size (bytes)', y='disk io (MB / s)');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3) big messages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data3 = read_csv('big_messages_benchmark.csv', [\n",
    "    'description',\n",
    "    'number of messages',\n",
    "    'message blob size (bytes)',\n",
    "    'transaction size',\n",
    "    'start writing time (ms)',\n",
    "    'write_throughput_0 (ms)',\n",
    "    'write_throughput_10 (ms)',\n",
    "    'write_throughput_20 (ms)',\n",
    "    'write_throughput_30 (ms)',\n",
    "    'write_throughput_40 (ms)',\n",
    "    'write_throughput_50 (ms)',\n",
    "    'write_throughput_60 (ms)',\n",
    "    'write_throughput_70 (ms)',\n",
    "    'write_throughput_80 (ms)',\n",
    "    'write_throughput_90 (ms)',\n",
    "    'write_throughput_100 (ms)',\n",
    "    'end writing time (ms)',\n",
    "    'start indexing time (ms)',\n",
    "    'end indexing time (ms)',\n",
    "    'disk usage (bytes)'\n",
    "])\n",
    "\n",
    "calc_duration(data3, 'start writing time (ms)', 'end writing time (ms)', 'writing time (ms)')\n",
    "calc_duration(data3, 'start indexing time (ms)', 'end indexing time (ms)', 'indexing time (ms)')\n",
    "\n",
    "scale_value(data3, 'disk usage (bytes)', 'disk usage (MB)', factor=1/1024/1024)\n",
    "\n",
    "data3['disk io (messages / s)'] = data3.apply(lambda row: row['number of messages'] / row['writing time (ms)'] * 1000, axis=1)\n",
    "data3['disk io (MB / s)'] = data3.apply(lambda row: row['number of messages'] * row['message blob size (bytes)'] / 1024 / 1024 / row['writing time (ms)'] * 1000, axis=1)\n",
    "\n",
    "throughput_abs = progress_column('write_throughput')\n",
    "throughput_duration = progress_column('write_duration')\n",
    "calc_write_progress(data3, throughput_abs, throughput_duration)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data3_aggregated = data3.groupby('description').median().sort_values(by='message blob size (bytes)')\n",
    "data3_aggregated"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Throughput\n",
    "This diagram shows how long it took to write each 10% slice of the messages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data3_aggregated[[throughput_duration(i) for i in percent]].T.plot(kind='bar')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data3_aggregated.plot(kind='bar', x='message blob size (bytes)', y='writing time (ms)');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data3_aggregated.plot(kind='bar', x='message blob size (bytes)', y='disk usage (MB)');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data3_aggregated.plot(kind='bar', x='message blob size (bytes)', y='disk io (messages / s)');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data3_aggregated.plot(kind='bar', x='message blob size (bytes)', y='disk io (MB / s)', sort_columns=True);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4) combination of small and big messages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data4 = read_csv('mixed_messages_benchmark.csv', [\n",
    "    'description',\n",
    "    'number of small messages',\n",
    "    'small message blob size (bytes)',\n",
    "    'number of medium messages',\n",
    "    'medium message blob size (bytes)',\n",
    "    'number of big messages',\n",
    "    'big message blob size (bytes)',\n",
    "    'transaction size',\n",
    "    'start writing time (ms)',\n",
    "    'write_throughput_0 (ms)',\n",
    "    'write_throughput_10 (ms)',\n",
    "    'write_throughput_20 (ms)',\n",
    "    'write_throughput_30 (ms)',\n",
    "    'write_throughput_40 (ms)',\n",
    "    'write_throughput_50 (ms)',\n",
    "    'write_throughput_60 (ms)',\n",
    "    'write_throughput_70 (ms)',\n",
    "    'write_throughput_80 (ms)',\n",
    "    'write_throughput_90 (ms)',\n",
    "    'write_throughput_100 (ms)',\n",
    "    'end writing time (ms)',\n",
    "    'start indexing time (ms)',\n",
    "    'end indexing time (ms)',\n",
    "    'disk usage (bytes)'\n",
    "])\n",
    "\n",
    "calc_duration(data4, 'start writing time (ms)', 'end writing time (ms)', 'writing time (ms)')\n",
    "calc_duration(data4, 'start indexing time (ms)', 'end indexing time (ms)', 'indexing time (ms)')\n",
    "\n",
    "scale_value(data4, 'disk usage (bytes)', 'disk usage (MB)', factor=1/1024/1024)\n",
    "\n",
    "data4['disk io (messages / s)'] = data4.apply(lambda row: (row['number of small messages'] + row['number of medium messages'] + row['number of big messages']) / row['writing time (ms)'] * 1000, axis=1)\n",
    "data4['disk io (MB / s)'] = data4.apply(lambda row: (row['number of small messages'] * row['small message blob size (bytes)'] + row['number of medium messages'] * row['medium message blob size (bytes)'] + row['number of big messages'] * row['big message blob size (bytes)']) / 1024 / 1024 / row['writing time (ms)'] * 1000, axis=1)\n",
    "\n",
    "\n",
    "throughput_abs = progress_column('write_throughput')\n",
    "throughput_duration = progress_column('write_duration')\n",
    "calc_write_progress(data4, throughput_abs, throughput_duration)\n",
    "\n",
    "data4"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Throughput\n",
    "This diagram shows how long it took to write each 10% slice of the messages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "data4.median()[[throughput_duration(i) for i in percent]].T.plot(kind='bar')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5) create index while or after writing file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#data5_columns = None\n",
    "#data5 = read_csv('sqlite-index-benchmark.csv', data5_columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 6) huge file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#data6_columns = None\n",
    "#data6 = read_csv('sqlite-huge-file-benchmark.csv', data6_columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "..."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
